{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2018-11-21 17:55:47--  ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.2L.h5\n",
      "           => ‘ag1000g.crosses.phase1.ar3sites.2L.h5’\n",
      "Resolving ngs.sanger.ac.uk (ngs.sanger.ac.uk)... 193.62.203.79\n",
      "Connecting to ngs.sanger.ac.uk (ngs.sanger.ac.uk)|193.62.203.79|:21... connected.\n",
      "Logging in as anonymous ... Logged in!\n",
      "==> SYST ... done.    ==> PWD ... done.\n",
      "==> TYPE I ... done.  ==> CWD (1) /production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5 ... done.\n",
      "==> SIZE ag1000g.crosses.phase1.ar3sites.2L.h5 ... 7959419941\n",
      "==> PASV ... done.    ==> RETR ag1000g.crosses.phase1.ar3sites.2L.h5 ... done.\n",
      "Length: 7959419941 (7.4G) (unauthoritative)\n",
      "\n",
      "ag1000g.crosses.pha 100%[===================>]   7.41G  2.49MB/s    in 48m 26s \n",
      "\n",
      "2018-11-21 18:44:14 (2.61 MB/s) - Control connection closed.\n",
      "Retrying.\n",
      "\n",
      "--2018-11-21 18:59:15--  ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.2L.h5\n",
      "  (try: 2) => ‘ag1000g.crosses.phase1.ar3sites.2L.h5’\n",
      "Connecting to ngs.sanger.ac.uk (ngs.sanger.ac.uk)|193.62.203.79|:21... connected.\n",
      "Logging in as anonymous ... Logged in!\n",
      "==> SYST ... done.    ==> PWD ... done.\n",
      "==> TYPE I ... done.  ==> CWD (1) /production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5 ... done.\n",
      "==> SIZE ag1000g.crosses.phase1.ar3sites.2L.h5 ... 7959419941\n",
      "File has already been retrieved.\n",
      "2018-11-21 18:59:17 (0.00 B/s) - ‘ag1000g.crosses.phase1.ar3sites.2L.h5’ saved [7959419941]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# !wget ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.3L.h5\n",
    "# !wget ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.2L.h5\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tra/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n"
     ]
    }
   ],
   "source": [
    "import pickle\n",
    "import gzip\n",
    "import random\n",
    "\n",
    "import numpy as np\n",
    "import h5py\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "80\n",
      "['cross-29-2' 'cross-36-9' 'cross-42-4' 'cross-46-9']\n",
      "          id function\n",
      "0   AD0231-C   parent\n",
      "1   AD0232-C   parent\n",
      "2   AD0234-C  progeny\n",
      "3   AD0235-C  progeny\n",
      "4   AD0236-C  progeny\n",
      "5   AD0237-C  progeny\n",
      "6   AD0238-C  progeny\n",
      "7   AD0239-C  progeny\n",
      "8   AD0240-C  progeny\n",
      "9   AD0241-C  progeny\n",
      "10  AD0242-C  progeny\n",
      "11  AD0243-C  progeny\n",
      "12  AD0244-C  progeny\n",
      "13  AD0245-C  progeny\n",
      "14  AD0246-C  progeny\n",
      "15  AD0247-C  progeny\n",
      "16  AD0248-C  progeny\n",
      "17  AD0249-C  progeny\n",
      "18  AD0250-C  progeny\n",
      "19  AD0251-C  progeny\n",
      "20  AD0252-C  progeny\n",
      "21  AD0253-C  progeny\n",
      "22\n",
      "          id       cross sex function\n",
      "0   AD0231-C  cross-29-2   F   parent\n",
      "1   AD0232-C  cross-29-2   M   parent\n",
      "22  AD0254-C  cross-36-9   F   parent\n",
      "23  AD0255-C  cross-36-9   M   parent\n",
      "41  AD0305-C  cross-42-4   F   parent\n",
      "42  AD0306-C  cross-42-4   M   parent\n",
      "57  AD0347-C  cross-46-9   F   parent\n",
      "58  AD0348-C  cross-46-9   M   parent\n"
     ]
    }
   ],
   "source": [
    "samples = pd.read_csv('samples.tsv', sep='\\t')\n",
    "print(len(samples))\n",
    "print(samples['cross'].unique())\n",
    "print(samples[samples['cross'] == 'cross-29-2'][['id', 'function']])\n",
    "print(len(samples[samples['cross'] == 'cross-29-2']))\n",
    "print(samples[samples['function'] == 'parent'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chromosome arm 3L"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['AD0231-C', 'AD0232-C', 'AD0234-C', 'AD0235-C', 'AD0236-C', 'AD0237-C', 'AD0238-C', 'AD0239-C', 'AD0240-C', 'AD0241-C', 'AD0242-C', 'AD0243-C', 'AD0244-C', 'AD0245-C', 'AD0246-C', 'AD0247-C', 'AD0248-C', 'AD0249-C', 'AD0250-C', 'AD0251-C', 'AD0252-C', 'AD0253-C', 'AD0254-C', 'AD0255-C', 'AD0259-C', 'AD0260-C', 'AD0261-C', 'AD0262-C', 'AD0263-C', 'AD0265-C', 'AD0266-C', 'AD0267-C', 'AD0268-C', 'AD0269-C', 'AD0270-C', 'AD0271-C', 'AD0272-C', 'AD0273-C', 'AD0274-C', 'AD0275-C', 'AD0276-C', 'AD0305-C', 'AD0306-C', 'AD0309-C', 'AD0310-C', 'AD0311-C', 'AD0312-C', 'AD0313-C', 'AD0314-C', 'AD0315-C', 'AD0316-C', 'AD0317-C', 'AD0318-C', 'AD0319-C', 'AD0320-C', 'AD0322-C', 'AD0323-C', 'AD0347-C', 'AD0348-C', 'AD0351-C', 'AD0352-C', 'AD0353-C', 'AD0354-C', 'AD0355-C', 'AD0356-C', 'AD0357-C', 'AD0358-C', 'AD0359-C', 'AD0360-C', 'AD0361-C', 'AD0362-C', 'AD0363-C', 'AD0364-C', 'AD0365-C', 'AD0366-C', 'AD0367-C', 'AD0368-C', 'AD0369-C', 'AD0370-C', 'AD0438-C']\n"
     ]
    }
   ],
   "source": [
    "h5_3L = h5py.File('ag1000g.crosses.phase1.ar3sites.3L.h5', 'r')\n",
    "samples_hdf5 = list(map(lambda sample: sample.decode('utf-8'), h5_3L['/3L/samples']))\n",
    "\n",
    "calldata_genotype = h5_3L['/3L/calldata/genotype']\n",
    "\n",
    "MQ0 = h5_3L['/3L/variants/MQ0']\n",
    "MQ = h5_3L['/3L/variants/MQ']\n",
    "QD = h5_3L['/3L/variants/QD']\n",
    "Coverage = h5_3L['/3L/variants/Coverage']\n",
    "CoverageMQ0 = h5_3L['/3L/variants/CoverageMQ0']\n",
    "HaplotypeScore = h5_3L['/3L/variants/HaplotypeScore']\n",
    "QUAL = h5_3L['/3L/variants/QUAL']\n",
    "FS = h5_3L['/3L/variants/FS']\n",
    "DP = h5_3L['/3L/variants/DP']\n",
    "HRun = h5_3L['/3L/variants/HRun']\n",
    "ReadPosRankSum = h5_3L['/3L/variants/ReadPosRankSum']\n",
    "my_features = {\n",
    "    'MQ': MQ,\n",
    "    'QD': QD,\n",
    "    'Coverage': Coverage,\n",
    "    'HaplotypeScore': HaplotypeScore,\n",
    "    'QUAL': QUAL,\n",
    "    'FS': FS,\n",
    "    'DP': DP,\n",
    "    'HRun': HRun,\n",
    "    'ReadPosRankSum': ReadPosRankSum\n",
    "}\n",
    "\n",
    "num_features = len(my_features)\n",
    "num_alleles = h5_3L['/3L/variants/num_alleles']\n",
    "is_snp = h5_3L['/3L/variants/is_snp']\n",
    "POS = h5_3L['/3L/variants/POS']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "#compute mendelian errors (biallelic)\n",
    "def compute_mendelian_errors(mother, father, offspring):\n",
    "    num_errors = 0\n",
    "    num_ofs_problems = 0\n",
    "    if len(mother.union(father)) == 1:\n",
    "        # Mother and father are homo and the same\n",
    "        for ofs in offspring:\n",
    "            if len(ofs) == 2:\n",
    "                # Offspring is het\n",
    "                num_errors += 1\n",
    "                num_ofs_problems += 1\n",
    "            elif len(ofs.intersection(mother)) == 0:\n",
    "                # Offspring is homo, but opposite from parents\n",
    "                num_errors += 2\n",
    "                num_ofs_problems += 1\n",
    "    elif len(mother) == 1 and len(father) == 1:\n",
    "        # Mother and father are homo and different\n",
    "        for ofs in offspring:\n",
    "            if len(ofs) == 1:\n",
    "                # Homo, should be het\n",
    "                num_errors += 1\n",
    "                num_ofs_problems += 1\n",
    "    elif len(mother) == 2 and len(father) == 2:\n",
    "        # Both are het, individual offspring can be anything\n",
    "        pass\n",
    "    else:\n",
    "        # One is het, the other is homo\n",
    "        homo = mother if len(mother) == 1 else father\n",
    "        for ofs in offspring:\n",
    "            if len(ofs) == 1 and ofs.intersection(homo):\n",
    "                # homo, but not including the allele from parent that is homo\n",
    "                num_errors += 1\n",
    "                num_ofs_problems += 1\n",
    "    return num_errors, num_ofs_problems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def acceptable_position_to_genotype():\n",
    "    for i, genotype in enumerate(calldata_genotype):\n",
    "        if is_snp[i] and num_alleles[i] == 2:\n",
    "            if len(np.where(genotype == -1)[0]) > 1:\n",
    "                # Missing data\n",
    "                continue\n",
    "            yield i\n",
    "\n",
    "def acumulate(fun):\n",
    "    acumulator = {}\n",
    "    for res in fun():\n",
    "        if res is not None:\n",
    "            acumulator[res[0]] = res[1]\n",
    "    return acumulator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_family_indexes(samples_hdf5, cross_pd):\n",
    "    offspring = []\n",
    "    for i, individual in cross_pd.T.iteritems():\n",
    "        index = samples_hdf5.index(individual.id)\n",
    "        if individual.function == 'parent':\n",
    "            if individual.sex == 'M':\n",
    "                father = index\n",
    "            else:\n",
    "                mother = index\n",
    "        else:\n",
    "            offspring.append(index)\n",
    "    return {'mother': mother, 'father': father, 'offspring': offspring}\n",
    "\n",
    "cross_pd = samples[samples['cross'] == 'cross-29-2']\n",
    "family_indexes = get_family_indexes(samples_hdf5, cross_pd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7213924\n",
      "11480576\n",
      "14187307\n",
      "18704624\n",
      "25150745\n",
      "27136205\n",
      "32631438\n",
      "34534402\n",
      "36613181\n",
      "38839024\n",
      "40814391\n"
     ]
    }
   ],
   "source": [
    "mother_index = family_indexes['mother']\n",
    "father_index = family_indexes['father']\n",
    "offspring_indexes = family_indexes['offspring']\n",
    "all_errors = {}\n",
    "\n",
    "\n",
    "def get_mendelian_errors():\n",
    "    for i in acceptable_position_to_genotype():\n",
    "        genotype = calldata_genotype[i]\n",
    "        mother = set(genotype[mother_index])\n",
    "        father = set(genotype[father_index])\n",
    "        offspring = [set(genotype[ofs_index]) for ofs_index in offspring_indexes]\n",
    "        my_mendelian_errors = compute_mendelian_errors(mother, father, offspring)\n",
    "        yield POS[i], my_mendelian_errors\n",
    "\n",
    "mendelian_errors = acumulate(get_mendelian_errors)\n",
    "\n",
    "pickle.dump(mendelian_errors, gzip.open('mendelian_errors.pickle.gz', 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Coverage\n",
      "DP\n",
      "FS\n",
      "HRun\n",
      "HaplotypeScore\n",
      "MQ\n",
      "QD\n",
      "QUAL\n",
      "ReadPosRankSum\n"
     ]
    }
   ],
   "source": [
    "ordered_positions = sorted(mendelian_errors.keys())\n",
    "ordered_features = sorted(my_features.keys())  #XXX on code?\n",
    "num_features = len(ordered_features)\n",
    "feature_fit = np.empty((len(ordered_positions), len(my_features) + 2), dtype=float)\n",
    "\n",
    "for column, feature in enumerate(ordered_features):  # 'Strange' order\n",
    "    print(feature)\n",
    "    current_hdf_row = 0\n",
    "    for row, genomic_position in enumerate(ordered_positions):\n",
    "        while POS[current_hdf_row] < genomic_position:\n",
    "            current_hdf_row +=1\n",
    "        feature_fit[row, column] = my_features[feature][current_hdf_row]\n",
    "\n",
    "for row, genomic_position in enumerate(ordered_positions):\n",
    "    feature_fit[row, num_features] = genomic_position\n",
    "    feature_fit[row, num_features + 1] = 1 if mendelian_errors[genomic_position][0] > 0 else 0\n",
    "\n",
    "np.save(gzip.open('feature_fit.npy.gz', 'wb'), feature_fit, allow_pickle=False, fix_imports=False)\n",
    "pickle.dump(ordered_features, open('ordered_features', 'wb'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chromosome arm 2L"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "h5_2L = h5py.File('ag1000g.crosses.phase1.ar3sites.2L.h5', 'r')\n",
    "samples_hdf5 = list(map(lambda sample: sample.decode('utf-8'), h5_2L['/2L/samples']))\n",
    "calldata_DP = h5_2L['/2L/calldata/DP']\n",
    "POS = h5_2L['/2L/variants/POS']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_parent_indexes(samples_hdf5, parents_pd):\n",
    "    parents = []\n",
    "    for i, individual in parents_pd.T.iteritems():\n",
    "        index = samples_hdf5.index(individual.id)\n",
    "        parents.append(index)\n",
    "    return parents\n",
    "\n",
    "parents_pd = samples[samples['function'] == 'parent']\n",
    "parent_indexes = get_parent_indexes(samples_hdf5, parents_pd)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_dps = []\n",
    "for i, pos in enumerate(POS):\n",
    "    if random.random() > 0.01:\n",
    "        continue\n",
    "    pos_dp = calldata_DP[i]\n",
    "    parent_pos_dp = [pos_dp[parent_index] for parent_index in parent_indexes]\n",
    "    all_dps.append(parent_pos_dp + [pos])\n",
    "all_dps = np.array(all_dps)\n",
    "np.save(gzip.open('DP_2L.npy.gz', 'wb'), all_dps, allow_pickle=False, fix_imports=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
